#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from __future__ import annotations

import argparse
import json
import re
from pathlib import Path

import numpy as np
import pandas as pd


# Keyword list used by main() when the config has no "lowcarbon_keywords"
# entry and neither --keyword-mode strict nor --keywords is given.
# build_regex() joins the terms into a single regex alternation that is used
# for substring matching, so an entry that contains another entry as a
# substring (e.g. "national carbon market" contains "carbon market") does not
# change which texts match.
DEFAULT_KEYWORDS = [
    "双碳",  # "dual carbon" goals
    "碳达峰",  # carbon peaking
    "碳中和",  # carbon neutrality
    "碳交易",  # carbon trading
    "碳市场",  # carbon market
    "全国碳市场",  # national carbon market
    "温室气体",  # greenhouse gas
    "减排",  # emission reduction
    "低碳",  # low-carbon
    "零碳",  # zero-carbon
    "绿色低碳",  # green and low-carbon
    "绿色转型",  # green transition
    "能耗双控",  # dual control of energy consumption
]

# Keyword list selected by main() when --keyword-mode strict is given
# (unless --keywords overrides it). It is DEFAULT_KEYWORDS without the
# "emission reduction", "low-carbon", "zero-carbon", "green and low-carbon"
# and "green transition" entries.
STRICT_KEYWORDS = [
    "双碳",  # "dual carbon" goals
    "碳达峰",  # carbon peaking
    "碳中和",  # carbon neutrality
    "碳交易",  # carbon trading
    "碳市场",  # carbon market
    "全国碳市场",  # national carbon market
    "温室气体",  # greenhouse gas
    "能耗双控",  # dual control of energy consumption
]


def load_config() -> dict:
    """Load the project configuration from JSON.

    ``media_project/config.json`` is used when it exists; otherwise
    ``media_project/config.example.json`` is read instead. Both paths are
    relative to the current working directory.

    Returns:
        The parsed JSON document.

    Raises:
        FileNotFoundError: If the selected file does not exist (i.e. neither
            the real config nor the example config is present).
        json.JSONDecodeError: If the file is not valid JSON.
    """
    primary = Path("media_project/config.json")
    fallback = Path("media_project/config.example.json")
    chosen = primary if primary.exists() else fallback
    with chosen.open(encoding="utf-8") as fh:
        return json.load(fh)


def normalize_code6(s: pd.Series) -> pd.Series:
    """Normalize raw code values to zero-padded digit strings.

    Each value is converted to pandas ``string`` dtype and stripped of
    surrounding whitespace. The literal strings ``""``, ``"nan"`` and
    ``"None"`` become missing. A trailing ``.0`` is removed, the first run of
    digits is extracted, and the result is left-padded with zeros to a width
    of six. Digit runs longer than six characters are kept as they are.

    Args:
        s: Series of raw code values.

    Returns:
        A Series of digit strings, with ``<NA>`` where no digits were found.
    """
    blank_markers = {"": pd.NA, "nan": pd.NA, "None": pd.NA}
    cleaned = s.astype("string").str.strip().replace(blank_markers)
    without_float_suffix = cleaned.str.replace(r"\.0$", "", regex=True)
    digits = without_float_suffix.str.extract(r"(\d+)", expand=False)
    return digits.str.zfill(6)


def build_regex(keywords: list[str]) -> re.Pattern:
    """Compile a pattern that matches any of *keywords* as a literal substring.

    Falsy and whitespace-only entries are ignored. Every remaining entry is
    escaped, so regex metacharacters in a keyword are matched literally.

    Args:
        keywords: Keywords to search for. ``None`` is treated as empty.

    Returns:
        A compiled alternation of the escaped keywords. When no usable
        keyword remains, a pattern that never matches is returned.
    """
    terms = [re.escape(str(k)) for k in (keywords or []) if k and str(k).strip()]
    if not terms:
        # An empty alternation would compile to "", which matches every
        # string and would flag every document as a hit. "Any of zero
        # keywords" must match nothing, so return a never-matching pattern.
        return re.compile(r"(?!)")
    return re.compile("|".join(terms))


def month_from_date_series(date_series: pd.Series) -> pd.Series:
    """Derive a ``YYYY-MM`` month label from date-like strings.

    A value is recognized when it contains a four-digit year, a one- or
    two-digit month in the range 1..12 and a one- or two-digit day, separated
    by ``-`` or ``/`` (for example ``2020-3-5`` or ``2020/03/05 08:00``).
    Everything after the day is ignored.

    Args:
        date_series: Series of raw date values; each is converted with
            ``astype(str)`` before matching.

    Returns:
        A Series aligned with the input holding zero-padded ``YYYY-MM``
        strings, and a missing value wherever no valid date was found,
        including out-of-range months such as ``2020-13-01``.
    """
    s = date_series.astype(str)
    # The month alternative only accepts 1..12, so impossible months are
    # rejected at extraction time instead of leaking through as bogus
    # "YYYY-13" / "YYYY-00" buckets.
    parts = s.str.extract(r"(?P<y>\d{4})[-/](?P<m>0?[1-9]|1[0-2])[-/]\d{1,2}")
    # Rows without a match have missing year and month, and a missing operand
    # propagates through the string concatenation.
    return parts["y"] + "-" + parts["m"].str.zfill(2)


def _normalize_start_month(value) -> str:
    """Return *value* as a zero-padded ``YYYY-MM`` string when it looks like a year-month.

    ``main`` filters rows by comparing ``YYYY-MM`` strings lexicographically,
    which is only correct when both sides are zero-padded and use ``-``. An
    unpadded value such as ``2013-1`` would otherwise exclude ``2013-02``
    through ``2013-09``. Values that do not look like ``YYYY-M[M]`` or
    ``YYYY/M[M]`` (for example a bare year) are returned with surrounding
    whitespace removed and are otherwise unchanged.
    """
    text = str(value).strip()
    m = re.fullmatch(r"(\d{4})[-/](\d{1,2})", text)
    if m is None:
        return text
    return f"{m.group(1)}-{int(m.group(2)):02d}"


def main():
    """Aggregate keyword hits in the source CSV per city and month and write a CSV.

    Settings come from ``load_config()`` and can be overridden on the command
    line:

    * ``--text-field``: which column(s) to search (``标题``, ``全文`` or
      ``both``); falls back to config key ``gov_wechat_text_field``.
    * ``--keyword-mode`` / ``--keywords``: keyword selection. ``--keywords``
      wins over ``strict`` mode, which wins over config ``lowcarbon_keywords``.
    * ``--start-month``: earliest ``YYYY-MM`` month to keep (config
      ``start_month``, default ``2013-01``).
    * ``--out``: output CSV path; defaults to
      ``<out_dir>/gov_wechat_lowcarbon_city_month.csv``.

    The source CSV (config key ``gov_wechat_csv``) is read in chunks. Rows
    without a usable city code or date are dropped. For each
    ``(city_code6, month)`` pair the output holds ``docs_total``, ``docs_hit``
    and ``topic_intensity = docs_hit / docs_total``. When no rows survive the
    filters, a header-only CSV is written to the same output path.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--text-field", choices=["标题", "全文", "both"], default=None)
    ap.add_argument("--keyword-mode", choices=["default", "strict"], default="default")
    ap.add_argument("--keywords", default=None, help="Comma-separated keyword override (bypasses keyword-mode/config).")
    ap.add_argument("--start-month", default=None)
    ap.add_argument("--out", default=None, help="Output CSV path (defaults to media_project/out/gov_wechat_lowcarbon_city_month.csv)")
    args = ap.parse_args()

    cfg = load_config()
    src = cfg["gov_wechat_csv"]
    out_dir = Path(cfg.get("out_dir", "media_project/out"))
    out_dir.mkdir(parents=True, exist_ok=True)
    # Resolve the destination once so the empty-result branch honors --out too,
    # and make sure a custom --out location can actually be written.
    out_path = Path(args.out) if args.out else (out_dir / "gov_wechat_lowcarbon_city_month.csv")
    out_path.parent.mkdir(parents=True, exist_ok=True)

    keywords = cfg.get("lowcarbon_keywords", DEFAULT_KEYWORDS)
    if args.keyword_mode == "strict":
        keywords = STRICT_KEYWORDS
    if args.keywords:
        keywords = [k.strip() for k in args.keywords.split(",") if k.strip()]
    text_field = args.text_field or cfg.get("gov_wechat_text_field", "标题")  # 标题 / 全文 / both
    # The month filter below is a string comparison, so the bound must be
    # zero-padded YYYY-MM to order correctly.
    start_month = _normalize_start_month(args.start_month or cfg.get("start_month", "2013-01"))
    chunksize = int(cfg.get("chunksize", 200_000))

    rx = build_regex(list(keywords))

    text_cols = ["标题", "全文"] if text_field == "both" else [text_field]
    usecols = ["发布日期", "城市代码", *text_cols]

    key_cols = ["city_code6", "month"]
    out_cols = [*key_cols, "docs_total", "docs_hit", "topic_intensity"]
    # One small (city, month) summary per chunk; combined after the read loop.
    partials = []

    # The context manager closes the underlying file handle even if a chunk
    # fails to process.
    with pd.read_csv(
        src,
        encoding="utf-8-sig",
        usecols=usecols,
        chunksize=chunksize,
        dtype=str,
        on_bad_lines="skip",
        low_memory=True,
    ) as reader:
        for chunk in reader:
            chunk = chunk.dropna(subset=["发布日期", "城市代码"])
            city_code6 = normalize_code6(chunk["城市代码"])
            month = month_from_date_series(chunk["发布日期"])
            chunk = chunk.assign(_city=city_code6, _month=month).dropna(subset=["_city", "_month"])
            chunk = chunk[chunk["_month"] >= start_month]
            if chunk.empty:
                continue

            if text_field == "both":
                text = (chunk["标题"].fillna("") + " " + chunk["全文"].fillna("")).astype(str)
            else:
                text = chunk[text_field].fillna("").astype(str)

            hit = text.str.contains(rx)
            tmp = pd.DataFrame(
                {
                    "city_code6": chunk["_city"].astype(str),
                    "month": chunk["_month"].astype(str),
                    # int64 so per-group sums cannot overflow a narrow integer type.
                    "hit": hit.astype("int64"),
                }
            )
            partials.append(
                tmp.groupby(key_cols, as_index=False).agg(docs_total=("hit", "size"), docs_hit=("hit", "sum"))
            )

    if not partials:
        # Write the header row so downstream readers still see the schema.
        pd.DataFrame(columns=out_cols).to_csv(out_path, index=False, encoding="utf-8-sig")
        print(f"Wrote {out_path} rows=0 (no matches)")
        print("Note: script only reads raw files and writes to out_dir; it does not modify the source archives.")
        return

    # Merge the per-chunk summaries; groupby sorts by (city_code6, month).
    df = (
        pd.concat(partials, ignore_index=True)
        .groupby(key_cols, as_index=False)[["docs_total", "docs_hit"]]
        .sum()
    )
    df["topic_intensity"] = df["docs_hit"] / df["docs_total"].replace({0: np.nan})

    df.to_csv(out_path, index=False, encoding="utf-8-sig")
    print(f"Wrote {out_path} rows={len(df)} field={text_field} start_month={start_month}")
    print("Note: script only reads raw files and writes to out_dir; it does not modify the source archives.")


if __name__ == "__main__":
    main()
